{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed649c19",
   "metadata": {},
   "source": [
    "# Tabula Data Loader\n",
    "\n",
    "This notebook contains instructions for creating scanpy AnnData versions of the following datasets:\n",
    "    \n",
    "    \n",
    "|Dataset|Paper|\n",
    "|-------|-----|\n",
    "|Tabula Sapiens|https://www.science.org/stoken/author-tokens/ST-495/full|\n",
    "|Tabula Microcebus|https://www.biorxiv.org/content/10.1101/2021.12.12.469460v1|\n",
    "|Tabula Muris|https://www.nature.com/articles/s41586-018-0590-4|"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fde35987",
   "metadata": {},
   "source": [
    "# Downloads\n",
    "\n",
    "Download Sapiens data from https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5\n",
    "\n",
    "Download Microcebus data from https://figshare.com/articles/dataset/Tabula_Microcebus_v1_0/14468196?file=31777475\n",
    "\n",
    "Download Muris data from https://figshare.com/articles/dataset/Single-cell_RNA-seq_data_from_microfluidic_emulsion_v2_/5968960/2\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff6a5a77",
   "metadata": {},
   "source": [
    "### Sapiens (~15 GB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "450f1fed",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 12.2G  100 12.2G    0     0  19.0M      0  0:10:58  0:10:58 --:--:-- 27.5M5  0:15:29 14.2M     0  0:12:42  0:05:09  0:07:33 26.4M\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# cellxgene serves the Tabula Sapiens .h5ad through a pre-signed S3 link that embeds a\n",
    "# temporary security token and stops working after 7 days (X-Amz-Expires=604800), so a\n",
    "# link must not be hardcoded in the notebook. Open\n",
    "# https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5,\n",
    "# copy a fresh .h5ad download link and expose it as the SAPIENS_H5AD_URL environment\n",
    "# variable (e.g. export it before starting Jupyter) before running this cell.\n",
    "sapiens_url = os.environ[\"SAPIENS_H5AD_URL\"]\n",
    "\n",
    "# --fail makes curl exit with an error instead of saving an S3 error document (such as\n",
    "# the response for an expired link) as local.h5ad; --location follows redirects.\n",
    "!curl --fail --location -o local.h5ad \"{sapiens_url}\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "94b568e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give the generically named cellxgene download its dataset-specific name under ./data.\n",
    "# mv does not create directories, so ./data must already exist.\n",
    "!mv local.h5ad ./data/sapiens.h5ad"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "96d4daa1",
   "metadata": {},
   "source": [
    "### Microcebus (~9 GB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "59e743db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: combining -O with -r or -p will mean that all downloaded content\n",
      "will be placed in the single file you specified.\n",
      "\n",
      "--2022-11-14 12:40:11--  https://figshare.com/ndownloader/files/31777475\n",
      "Resolving figshare.com (figshare.com)... 34.252.180.148, 34.250.174.243, 2a05:d018:1f4:d003:64d9:8f4f:2f30:52f7, ...\n",
      "Connecting to figshare.com (figshare.com)|34.252.180.148|:443... connected.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/31777475/LCA_complete_wRaw_toPublish.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T204011Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=7739e52b24fa0be35eff1268e3dd1d157c3907486515a9ac9ddcb492a7ec2f89 [following]\n",
      "--2022-11-14 12:40:11--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/31777475/LCA_complete_wRaw_toPublish.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T204011Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=7739e52b24fa0be35eff1268e3dd1d157c3907486515a9ac9ddcb492a7ec2f89\n",
      "Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.60.203, 52.218.44.24, 52.218.1.67, ...\n",
      "Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.60.203|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 9001157888 (8.4G) [application/octet-stream]\n",
      "Saving to: ‘./data/mouse_lemur.h5ad’\n",
      "\n",
      "./data/mouse_lemur. 100%[===================>]   8.38G  9.81MB/s    in 18m 44s \n",
      "\n",
      "2022-11-14 12:58:56 (7.64 MB/s) - ‘./data/mouse_lemur.h5ad’ saved [9001157888/9001157888]\n",
      "\n",
      "FINISHED --2022-11-14 12:58:57--\n",
      "Total wall clock time: 18m 46s\n",
      "Downloaded: 1 files, 8.4G in 18m 44s (7.64 MB/s)\n"
     ]
    }
   ],
   "source": [
    "# Only one file is fetched, so wget's recursive mode (-r) is not needed; together with -O\n",
    "# it merely triggers the \"combining -O with -r or -p\" warning.\n",
    "!wget \"https://figshare.com/ndownloader/files/31777475\" -O ./data/mouse_lemur.h5ad"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03e7134e",
   "metadata": {},
   "source": [
    "### Muris (~1 GB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "affbcde2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: combining -O with -r or -p will mean that all downloaded content\n",
      "will be placed in the single file you specified.\n",
      "\n",
      "--2022-11-14 13:04:12--  https://figshare.com/ndownloader/files/10700167\n",
      "Resolving figshare.com (figshare.com)... 34.250.174.243, 34.252.180.148, 2a05:d018:1f4:d003:64d9:8f4f:2f30:52f7, ...\n",
      "Connecting to figshare.com (figshare.com)|34.250.174.243|:443... connected.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10700167/droplet.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T210413Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=b4dd64b58b58a58ea7b1a9831292b6a959b4981fb3ec9e14ba84e9be392517d8 [following]\n",
      "--2022-11-14 13:04:13--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10700167/droplet.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T210413Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=b4dd64b58b58a58ea7b1a9831292b6a959b4981fb3ec9e14ba84e9be392517d8\n",
      "Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.52.100, 52.92.2.200, 52.92.33.96, ...\n",
      "Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.52.100|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 392670694 (374M) [binary/octet-stream]\n",
      "Saving to: ‘./data/muris.zip’\n",
      "\n",
      "./data/muris.zip    100%[===================>] 374.48M  4.23MB/s    in 64s     \n",
      "\n",
      "2022-11-14 13:05:18 (5.81 MB/s) - ‘./data/muris.zip’ saved [392670694/392670694]\n",
      "\n",
      "FINISHED --2022-11-14 13:05:18--\n",
      "Total wall clock time: 1m 6s\n",
      "Downloaded: 1 files, 374M in 1m 4s (5.81 MB/s)\n",
      "WARNING: combining -O with -r or -p will mean that all downloaded content\n",
      "will be placed in the single file you specified.\n",
      "\n",
      "--2022-11-14 13:05:20--  https://figshare.com/ndownloader/files/10881902\n",
      "Resolving figshare.com (figshare.com)... 34.252.180.148, 34.250.174.243, 2a05:d018:1f4:d003:64d9:8f4f:2f30:52f7, ...\n",
      "Connecting to figshare.com (figshare.com)|34.252.180.148|:443... connected.\n",
      "HTTP request sent, awaiting response... 302 Found\n",
      "Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10881902/annotations_droplet.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T210520Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=35505ecba6dd87313d1a06bda616c17bffaf5a0fe998a4d006a588a9891ffb8e [following]\n",
      "--2022-11-14 13:05:20--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/10881902/annotations_droplet.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20221114/eu-west-1/s3/aws4_request&X-Amz-Date=20221114T210520Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=35505ecba6dd87313d1a06bda616c17bffaf5a0fe998a4d006a588a9891ffb8e\n",
      "Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.30.83, 52.218.28.19, 52.218.46.80, ...\n",
      "Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.30.83|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 7623577 (7.3M) [binary/octet-stream]\n",
      "Saving to: ‘./data/muris_annotations_droplet.csv’\n",
      "\n",
      "./data/muris_annota 100%[===================>]   7.27M  3.41MB/s    in 2.1s    \n",
      "\n",
      "2022-11-14 13:05:23 (3.41 MB/s) - ‘./data/muris_annotations_droplet.csv’ saved [7623577/7623577]\n",
      "\n",
      "FINISHED --2022-11-14 13:05:23--\n",
      "Total wall clock time: 3.5s\n",
      "Downloaded: 1 files, 7.3M in 2.1s (3.41 MB/s)\n"
     ]
    }
   ],
   "source": [
    "# Two single-file downloads: the droplet count matrices (zip archive) and the matching\n",
    "# annotations CSV. Recursive mode (-r) is not needed for single files and, together with -O,\n",
    "# only triggers wget's \"combining -O with -r or -p\" warning, so it is omitted.\n",
    "!wget \"https://figshare.com/ndownloader/files/10700167\" -O ./data/muris.zip\n",
    "!wget \"https://figshare.com/ndownloader/files/10881902\" -O ./data/muris_annotations_droplet.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e2cf03ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Archive:  ./data/muris.zip\n",
      "   creating: droplet/\n",
      "  inflating: droplet/.DS_Store       \n",
      "   creating: __MACOSX/droplet/\n",
      "  inflating: __MACOSX/droplet/._.DS_Store  \n",
      "   creating: droplet/Bladder-10X_P4_3/\n",
      "  inflating: droplet/Bladder-10X_P4_3/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Bladder-10X_P4_3/\n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_3/._barcodes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P4_3/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_3/._genes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P4_3/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_3/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Bladder-10X_P4_3  \n",
      "   creating: droplet/Bladder-10X_P4_4/\n",
      "  inflating: droplet/Bladder-10X_P4_4/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Bladder-10X_P4_4/\n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_4/._barcodes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P4_4/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_4/._genes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P4_4/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P4_4/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Bladder-10X_P4_4  \n",
      "   creating: droplet/Bladder-10X_P7_7/\n",
      "  inflating: droplet/Bladder-10X_P7_7/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Bladder-10X_P7_7/\n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P7_7/._barcodes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P7_7/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P7_7/._genes.tsv  \n",
      "  inflating: droplet/Bladder-10X_P7_7/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Bladder-10X_P7_7/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Bladder-10X_P7_7  \n",
      "   creating: droplet/Heart_and_Aorta-10X_P7_4/\n",
      "  inflating: droplet/Heart_and_Aorta-10X_P7_4/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Heart_and_Aorta-10X_P7_4/\n",
      "  inflating: __MACOSX/droplet/Heart_and_Aorta-10X_P7_4/._barcodes.tsv  \n",
      "  inflating: droplet/Heart_and_Aorta-10X_P7_4/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Heart_and_Aorta-10X_P7_4/._genes.tsv  \n",
      "  inflating: droplet/Heart_and_Aorta-10X_P7_4/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Heart_and_Aorta-10X_P7_4/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Heart_and_Aorta-10X_P7_4  \n",
      "   creating: droplet/Kidney-10X_P4_5/\n",
      "  inflating: droplet/Kidney-10X_P4_5/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Kidney-10X_P4_5/\n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_5/._barcodes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P4_5/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_5/._genes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P4_5/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_5/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Kidney-10X_P4_5  \n",
      "   creating: droplet/Kidney-10X_P4_6/\n",
      "  inflating: droplet/Kidney-10X_P4_6/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Kidney-10X_P4_6/\n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_6/._barcodes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P4_6/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_6/._genes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P4_6/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P4_6/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Kidney-10X_P4_6  \n",
      "   creating: droplet/Kidney-10X_P7_5/\n",
      "  inflating: droplet/Kidney-10X_P7_5/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Kidney-10X_P7_5/\n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P7_5/._barcodes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P7_5/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P7_5/._genes.tsv  \n",
      "  inflating: droplet/Kidney-10X_P7_5/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Kidney-10X_P7_5/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Kidney-10X_P7_5  \n",
      "   creating: droplet/Limb_Muscle-10X_P7_14/\n",
      "  inflating: droplet/Limb_Muscle-10X_P7_14/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Limb_Muscle-10X_P7_14/\n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_14/._barcodes.tsv  \n",
      "  inflating: droplet/Limb_Muscle-10X_P7_14/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_14/._genes.tsv  \n",
      "  inflating: droplet/Limb_Muscle-10X_P7_14/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_14/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Limb_Muscle-10X_P7_14  \n",
      "   creating: droplet/Limb_Muscle-10X_P7_15/\n",
      "  inflating: droplet/Limb_Muscle-10X_P7_15/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Limb_Muscle-10X_P7_15/\n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_15/._barcodes.tsv  \n",
      "  inflating: droplet/Limb_Muscle-10X_P7_15/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_15/._genes.tsv  \n",
      "  inflating: droplet/Limb_Muscle-10X_P7_15/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Limb_Muscle-10X_P7_15/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Limb_Muscle-10X_P7_15  \n",
      "   creating: droplet/Liver-10X_P4_2/\n",
      "  inflating: droplet/Liver-10X_P4_2/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Liver-10X_P4_2/\n",
      "  inflating: __MACOSX/droplet/Liver-10X_P4_2/._barcodes.tsv  \n",
      "  inflating: droplet/Liver-10X_P4_2/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P4_2/._genes.tsv  \n",
      "  inflating: droplet/Liver-10X_P4_2/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P4_2/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Liver-10X_P4_2  \n",
      "   creating: droplet/Liver-10X_P7_0/\n",
      "  inflating: droplet/Liver-10X_P7_0/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Liver-10X_P7_0/\n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_0/._barcodes.tsv  \n",
      "  inflating: droplet/Liver-10X_P7_0/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_0/._genes.tsv  \n",
      "  inflating: droplet/Liver-10X_P7_0/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_0/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Liver-10X_P7_0  \n",
      "   creating: droplet/Liver-10X_P7_1/\n",
      "  inflating: droplet/Liver-10X_P7_1/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Liver-10X_P7_1/\n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_1/._barcodes.tsv  \n",
      "  inflating: droplet/Liver-10X_P7_1/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_1/._genes.tsv  \n",
      "  inflating: droplet/Liver-10X_P7_1/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Liver-10X_P7_1/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Liver-10X_P7_1  \n",
      "   creating: droplet/Lung-10X_P7_8/\n",
      "  inflating: droplet/Lung-10X_P7_8/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Lung-10X_P7_8/\n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_8/._barcodes.tsv  \n",
      "  inflating: droplet/Lung-10X_P7_8/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_8/._genes.tsv  \n",
      "  inflating: droplet/Lung-10X_P7_8/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_8/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Lung-10X_P7_8  \n",
      "   creating: droplet/Lung-10X_P7_9/\n",
      "  inflating: droplet/Lung-10X_P7_9/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Lung-10X_P7_9/\n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_9/._barcodes.tsv  \n",
      "  inflating: droplet/Lung-10X_P7_9/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_9/._genes.tsv  \n",
      "  inflating: droplet/Lung-10X_P7_9/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P7_9/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Lung-10X_P7_9  \n",
      "   creating: droplet/Lung-10X_P8_12/\n",
      "  inflating: droplet/Lung-10X_P8_12/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Lung-10X_P8_12/\n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_12/._barcodes.tsv  \n",
      "  inflating: droplet/Lung-10X_P8_12/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_12/._genes.tsv  \n",
      "  inflating: droplet/Lung-10X_P8_12/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_12/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Lung-10X_P8_12  \n",
      "   creating: droplet/Lung-10X_P8_13/\n",
      "  inflating: droplet/Lung-10X_P8_13/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Lung-10X_P8_13/\n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_13/._barcodes.tsv  \n",
      "  inflating: droplet/Lung-10X_P8_13/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_13/._genes.tsv  \n",
      "  inflating: droplet/Lung-10X_P8_13/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Lung-10X_P8_13/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Lung-10X_P8_13  \n",
      "   creating: droplet/Mammary_Gland-10X_P7_12/\n",
      "  inflating: droplet/Mammary_Gland-10X_P7_12/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Mammary_Gland-10X_P7_12/\n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_12/._barcodes.tsv  \n",
      "  inflating: droplet/Mammary_Gland-10X_P7_12/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_12/._genes.tsv  \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  inflating: droplet/Mammary_Gland-10X_P7_12/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_12/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Mammary_Gland-10X_P7_12  \n",
      "   creating: droplet/Mammary_Gland-10X_P7_13/\n",
      "  inflating: droplet/Mammary_Gland-10X_P7_13/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Mammary_Gland-10X_P7_13/\n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_13/._barcodes.tsv  \n",
      "  inflating: droplet/Mammary_Gland-10X_P7_13/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_13/._genes.tsv  \n",
      "  inflating: droplet/Mammary_Gland-10X_P7_13/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Mammary_Gland-10X_P7_13/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Mammary_Gland-10X_P7_13  \n",
      "   creating: droplet/Marrow-10X_P7_2/\n",
      "  inflating: droplet/Marrow-10X_P7_2/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Marrow-10X_P7_2/\n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_2/._barcodes.tsv  \n",
      "  inflating: droplet/Marrow-10X_P7_2/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_2/._genes.tsv  \n",
      "  inflating: droplet/Marrow-10X_P7_2/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_2/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Marrow-10X_P7_2  \n",
      "   creating: droplet/Marrow-10X_P7_3/\n",
      "  inflating: droplet/Marrow-10X_P7_3/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Marrow-10X_P7_3/\n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_3/._barcodes.tsv  \n",
      "  inflating: droplet/Marrow-10X_P7_3/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_3/._genes.tsv  \n",
      "  inflating: droplet/Marrow-10X_P7_3/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Marrow-10X_P7_3/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Marrow-10X_P7_3  \n",
      "   creating: droplet/Spleen-10X_P4_7/\n",
      "  inflating: droplet/Spleen-10X_P4_7/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Spleen-10X_P4_7/\n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P4_7/._barcodes.tsv  \n",
      "  inflating: droplet/Spleen-10X_P4_7/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P4_7/._genes.tsv  \n",
      "  inflating: droplet/Spleen-10X_P4_7/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P4_7/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Spleen-10X_P4_7  \n",
      "   creating: droplet/Spleen-10X_P7_6/\n",
      "  inflating: droplet/Spleen-10X_P7_6/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Spleen-10X_P7_6/\n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P7_6/._barcodes.tsv  \n",
      "  inflating: droplet/Spleen-10X_P7_6/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P7_6/._genes.tsv  \n",
      "  inflating: droplet/Spleen-10X_P7_6/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Spleen-10X_P7_6/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Spleen-10X_P7_6  \n",
      "   creating: droplet/Thymus-10X_P7_11/\n",
      "  inflating: droplet/Thymus-10X_P7_11/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Thymus-10X_P7_11/\n",
      "  inflating: __MACOSX/droplet/Thymus-10X_P7_11/._barcodes.tsv  \n",
      "  inflating: droplet/Thymus-10X_P7_11/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Thymus-10X_P7_11/._genes.tsv  \n",
      "  inflating: droplet/Thymus-10X_P7_11/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Thymus-10X_P7_11/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Thymus-10X_P7_11  \n",
      "   creating: droplet/Tongue-10X_P4_0/\n",
      "  inflating: droplet/Tongue-10X_P4_0/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Tongue-10X_P4_0/\n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_0/._barcodes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P4_0/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_0/._genes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P4_0/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_0/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Tongue-10X_P4_0  \n",
      "   creating: droplet/Tongue-10X_P4_1/\n",
      "  inflating: droplet/Tongue-10X_P4_1/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Tongue-10X_P4_1/\n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_1/._barcodes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P4_1/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_1/._genes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P4_1/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P4_1/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Tongue-10X_P4_1  \n",
      "   creating: droplet/Tongue-10X_P7_10/\n",
      "  inflating: droplet/Tongue-10X_P7_10/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Tongue-10X_P7_10/\n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P7_10/._barcodes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P7_10/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P7_10/._genes.tsv  \n",
      "  inflating: droplet/Tongue-10X_P7_10/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Tongue-10X_P7_10/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Tongue-10X_P7_10  \n",
      "   creating: droplet/Trachea-10X_P8_14/\n",
      "  inflating: droplet/Trachea-10X_P8_14/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Trachea-10X_P8_14/\n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_14/._barcodes.tsv  \n",
      "  inflating: droplet/Trachea-10X_P8_14/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_14/._genes.tsv  \n",
      "  inflating: droplet/Trachea-10X_P8_14/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_14/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Trachea-10X_P8_14  \n",
      "   creating: droplet/Trachea-10X_P8_15/\n",
      "  inflating: droplet/Trachea-10X_P8_15/barcodes.tsv  \n",
      "   creating: __MACOSX/droplet/Trachea-10X_P8_15/\n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_15/._barcodes.tsv  \n",
      "  inflating: droplet/Trachea-10X_P8_15/genes.tsv  \n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_15/._genes.tsv  \n",
      "  inflating: droplet/Trachea-10X_P8_15/matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/Trachea-10X_P8_15/._matrix.mtx  \n",
      "  inflating: __MACOSX/droplet/._Trachea-10X_P8_15  \n",
      "  inflating: __MACOSX/._droplet      \n"
     ]
    }
   ],
   "source": [
    "# Extract the archive's top-level droplet/ folder into the current directory.\n",
    "#   -q  suppress the per-file listing (several hundred lines of output)\n",
    "#   -o  overwrite existing files without prompting, so the cell can be re-run\n",
    "#   -x  skip the macOS metadata copies (__MACOSX/) bundled in the archive\n",
    "!unzip -q -o ./data/muris.zip -x \"__MACOSX/*\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "322c2fc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move the extracted droplet/ folder next to the other downloads. Run this only once:\n",
    "# if ./data/droplet already exists, mv places droplet inside it instead of replacing it.\n",
    "!mv droplet ./data/droplet"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba822c40",
   "metadata": {},
   "source": [
    "# Create the AnnDatas for Alignment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "b53dac08",
   "metadata": {},
   "outputs": [],
   "source": [
    "import scanpy as sc\n",
    "from glob import glob\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f412c830",
   "metadata": {},
   "source": [
    "# Coarsen / Map Tissues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "2a045031",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For the coarse alignment pictured in the UMAP in figure 1, use these settings\n",
    "cell_type_number_filter = 350\n",
    "tissue_subset = True\n",
    "ten_x_subset = True\n",
    "\n",
    "# For a fine tissue based alignment, use these settings\n",
    "#cell_type_number_filter = 0\n",
    "#tissue_subset = False\n",
    "#ten_x_subset = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "f49b18be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bone_marrow',\n",
       " 'muscle',\n",
       " 'pancreas',\n",
       " 'spleen',\n",
       " 'thymus',\n",
       " 'trachea',\n",
       " 'bladder',\n",
       " 'lung',\n",
       " 'kidney']"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Harmonised (coarse) tissue labels shared by all three datasets; every value used in the\n",
    "# three *_tissue_map dicts below appears in this list.\n",
    "# NOTE: \"tounge\" (sic) is the spelling used for this label throughout this cell; keep it\n",
    "# consistent if it is ever changed.\n",
    "all_tissues = [\"liver\", \"trachea\", \"tounge\", \"spleen\", \n",
    "               \"skin\", \"bladder\", \"bone_marrow\",\n",
    "               \"heart_and_aorta\", \"lung\", \"blood\",\n",
    "               \"mammary\", \"bone\", \"intestine\", \"uterus\",\n",
    "               \"fat\", \"kidney\", \"pancreas\", \"eye\", \"prostate\", \n",
    "               \"muscle\", \"thymus\", \"brain\", \"colon\", \"endocrine\",  \"testes\",\n",
    "               \"lymph_node\", \"salivary_gland\"\n",
    "              ]\n",
    "\n",
    "# Human tissue name -> harmonised label. Several source tissues can share one label\n",
    "# (e.g. Large_Intestine and Small_Intestine both become \"intestine\").\n",
    "# NOTE(review): keys are presumably the tissue names used in the Tabula Sapiens\n",
    "# annotations - confirm against the cell that applies this map.\n",
    "human_tissue_map = {\n",
    "    \"Liver\": \"liver\",\n",
    "    \"Trachea\": \"trachea\",\n",
    "    \"Blood\": \"blood\",\n",
    "    \"Lymph_Node\": \"lymph_node\",\n",
    "    \"Salivary_Gland\": \"salivary_gland\",\n",
    "    \"Spleen\": \"spleen\",\n",
    "    \"Tongue\": \"tounge\",\n",
    "    \"Mammary\": \"mammary\",\n",
    "    \"Uterus\": \"uterus\",\n",
    "    \"Eye\": \"eye\",\n",
    "    \"Fat\": \"fat\",\n",
    "    \"Skin\": \"skin\",\n",
    "    \"Bone_Marrow\": \"bone_marrow\",\n",
    "    \"Heart\": \"heart_and_aorta\",\n",
    "    \"Pancreas\": \"pancreas\",\n",
    "    \"Prostate\": \"prostate\",\n",
    "    \"Muscle\": \"muscle\",\n",
    "    \"Thymus\": \"thymus\",\n",
    "    \"Bladder\": \"bladder\",\n",
    "    \"Large_Intestine\": \"intestine\",\n",
    "    \"Lung\": \"lung\",\n",
    "    \"Small_Intestine\": \"intestine\",\n",
    "    \"Vasculature\": \"muscle\",\n",
    "    \"Kidney\": \"kidney\"\n",
    "}\n",
    "\n",
    "\n",
    "# Mouse tissue name -> harmonised label.\n",
    "# NOTE(review): keys look like the tissue prefixes of the droplet/<Tissue>-10X_* folders\n",
    "# (e.g. Heart_and_Aorta, Limb_Muscle, Marrow) - confirm against the loading code.\n",
    "mouse_tissue_map = {\n",
    "    \"Tongue\": \"tounge\",\n",
    "    \"Heart_and_Aorta\": \"heart_and_aorta\",\n",
    "    \"Marrow\": \"bone_marrow\",\n",
    "    \"Mammary_Gland\": \"mammary\",\n",
    "    \"Fat\": \"fat\",\n",
    "    \"Kidney\": \"kidney\",\n",
    "    \"Liver\": \"liver\",\n",
    "    \"Lung\": \"lung\",\n",
    "    \"Limb_Muscle\": \"muscle\",\n",
    "    \"Pancreas\": \"pancreas\",\n",
    "    \"Spleen\": \"spleen\",\n",
    "    \"Thymus\": \"thymus\",\n",
    "    \"Bladder\": \"bladder\",\n",
    "    \"Skin\": \"skin\",\n",
    "    \"Large_Intestine\": \"intestine\",\n",
    "    \"Trachea\": \"trachea\"\n",
    "}\n",
    "\n",
    "\n",
    "# Mouse lemur tissue name -> harmonised label. Several source tissues can share one label\n",
    "# (e.g. Heart and Aorta both become \"heart_and_aorta\").\n",
    "# NOTE(review): keys are presumably the tissue names used in the Tabula Microcebus\n",
    "# annotations - confirm against the cell that applies this map.\n",
    "lemur_tissue_map = {\n",
    "    \"Testes\": \"testes\",\n",
    "    \"Heart\": \"heart_and_aorta\",\n",
    "    \"Liver\": \"liver\",\n",
    "    \"Thymus\": \"thymus\",\n",
    "    \"Eye_retina\": \"eye\",\n",
    "    \"Brain_cortex\": \"brain\",\n",
    "    \"Brainstem\": \"brain\",\n",
    "    \"Pancreas\": \"pancreas\",\n",
    "    \"Small_intestine\": \"intestine\",\n",
    "    \"Lung\": \"lung\",\n",
    "    \"Kidney\": \"kidney\",\n",
    "    \"Tongue\": \"tounge\",\n",
    "    \"Diaphragm\": \"muscle\",\n",
    "    \"Limb_muscle\": \"muscle\",\n",
    "    \"Spleen\": \"spleen\",\n",
    "    \"Blood\": \"blood\",\n",
    "    \"Bone\": \"bone\",\n",
    "    \"Bone_marrow\": \"bone_marrow\",\n",
    "    \"Bladder\": \"bladder\",\n",
    "    \"Skin\": \"skin\",\n",
    "    \"Colon\": \"colon\",\n",
    "    \"Aorta\": \"heart_and_aorta\",\n",
    "    \"Hypothalamus_Pituitary\": \"endocrine\",\n",
    "    \"Mammary_gland\": \"mammary\",\n",
    "    \"Fat\": \"fat\",\n",
    "    \"Uterus\": \"uterus\",\n",
    "    \"Trachea\": \"trachea\"\n",
    "}\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Harmonised tissues to retain; displayed as this cell's output.\n",
    "# NOTE(review): presumably applied only when tissue_subset is True - confirm below.\n",
    "keep_tissues = [\"bone_marrow\", \"muscle\", \"pancreas\", \"spleen\", \"thymus\", \"trachea\", \"bladder\", \n",
    "                \"lung\", \"kidney\"] # _full\n",
    "                #\"heart_and_aorta\", \"intestine\", \"skin\", \"muscle\"] # full_more_tissues\n",
    "                # The extended list above is disabled: it causes a bug where highly variable (HV)\n",
    "                # gene selection does not work for lemur heart and aorta due to too few cells.\n",
    "keep_tissues"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fad39422",
   "metadata": {},
   "source": [
    "# Coarsen Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "4005d332",
   "metadata": {},
   "outputs": [],
   "source": [
    "obo_loc = \"./data/cl.obo.txt\"\n",
    "with open(obo_loc, \"r\", encoding='utf-8') as f:\n",
    "    obo = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "b10eb213",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([   17,    33,    45, ..., 22772, 22782, 22792])"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obo_term_idxs = np.where([o.startswith('[Term]') for o in obo])[0]\n",
    "obo_term_idxs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "d0bf80db",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_group(lines):\n",
    "    term_id = lines[[o.startswith(\"id:\") for o in lines]][0].strip().split()[1]\n",
    "    try:\n",
    "        is_a = lines[[o.startswith(\"is_a:\") for o in lines]][0].strip().split()[1]\n",
    "    except:\n",
    "        is_a = None\n",
    "    name = lines[[o.startswith(\"name:\") for o in lines]][0].strip().split(\"name:\")[1].strip()\n",
    "    return name, is_a, term_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "7aee0821",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_rows = []\n",
    "for i in range(1, len(obo_term_idxs) - 1):\n",
    "    ls = np.array(obo[obo_term_idxs[i]:obo_term_idxs[i+1]])\n",
    "    r = parse_group(ls)\n",
    "    all_rows.append(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "38a40473",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>is_a</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CL:0000001</th>\n",
       "      <td>primary cultured cell</td>\n",
       "      <td>CL:0000010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000002</th>\n",
       "      <td>obsolete immortal cell line cell</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000003</th>\n",
       "      <td>native cell</td>\n",
       "      <td>CL:0000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000004</th>\n",
       "      <td>obsolete cell by organism</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000005</th>\n",
       "      <td>fibroblast neural crest derived</td>\n",
       "      <td>CL:0000057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000006</th>\n",
       "      <td>neuronal receptor cell</td>\n",
       "      <td>CL:0000101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000007</th>\n",
       "      <td>early embryonic cell</td>\n",
       "      <td>CL:0002321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000008</th>\n",
       "      <td>migratory cranial neural crest cell</td>\n",
       "      <td>CL:0000333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000009</th>\n",
       "      <td>obsolete fusiform initial</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000010</th>\n",
       "      <td>cultured cell</td>\n",
       "      <td>CL:0000578</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000011</th>\n",
       "      <td>migratory trunk neural crest cell</td>\n",
       "      <td>CL:0000333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000012</th>\n",
       "      <td>obsolete cell by class</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000013</th>\n",
       "      <td>obsolete dentine secreting cell</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000014</th>\n",
       "      <td>germ line stem cell</td>\n",
       "      <td>CL:0000034</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000015</th>\n",
       "      <td>male germ cell</td>\n",
       "      <td>CL:0000586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000016</th>\n",
       "      <td>male germ line stem cell</td>\n",
       "      <td>CL:0000014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000017</th>\n",
       "      <td>spermatocyte</td>\n",
       "      <td>CL:0000015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000018</th>\n",
       "      <td>spermatid</td>\n",
       "      <td>CL:0000015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000019</th>\n",
       "      <td>sperm</td>\n",
       "      <td>CL:0000064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CL:0000020</th>\n",
       "      <td>spermatogonium</td>\n",
       "      <td>CL:0000015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           name        is_a\n",
       "id                                                         \n",
       "CL:0000001                primary cultured cell  CL:0000010\n",
       "CL:0000002     obsolete immortal cell line cell        None\n",
       "CL:0000003                          native cell  CL:0000000\n",
       "CL:0000004            obsolete cell by organism        None\n",
       "CL:0000005      fibroblast neural crest derived  CL:0000057\n",
       "CL:0000006               neuronal receptor cell  CL:0000101\n",
       "CL:0000007                 early embryonic cell  CL:0002321\n",
       "CL:0000008  migratory cranial neural crest cell  CL:0000333\n",
       "CL:0000009            obsolete fusiform initial        None\n",
       "CL:0000010                        cultured cell  CL:0000578\n",
       "CL:0000011    migratory trunk neural crest cell  CL:0000333\n",
       "CL:0000012               obsolete cell by class        None\n",
       "CL:0000013      obsolete dentine secreting cell        None\n",
       "CL:0000014                  germ line stem cell  CL:0000034\n",
       "CL:0000015                       male germ cell  CL:0000586\n",
       "CL:0000016             male germ line stem cell  CL:0000014\n",
       "CL:0000017                         spermatocyte  CL:0000015\n",
       "CL:0000018                            spermatid  CL:0000015\n",
       "CL:0000019                                sperm  CL:0000064\n",
       "CL:0000020                       spermatogonium  CL:0000015"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obo_tbl = pd.DataFrame(all_rows, columns=[\"name\", \"is_a\", \"id\"]).set_index(\"id\")\n",
    "obo_tbl.index = obo_tbl.index.astype(str)\n",
    "obo_tbl.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "967cdd68",
   "metadata": {},
   "outputs": [],
   "source": [
    "COARSE_MIN_CT = 4\n",
    "EXCLUDED_CTS = [1, 2, 3, 4, 255, 10, 548, 325, 2371, 630, 219, 11115, 473, 145, 62, 7010]\n",
    "def coarsen(cl_id):\n",
    "    if not int(cl_id.split(\":\")[1]) in EXCLUDED_CTS:\n",
    "        r = obo_tbl.loc[cl_id]\n",
    "        new_id = r[\"is_a\"]\n",
    "        new_name = obo_tbl.loc[new_id][\"name\"]\n",
    "        if int(new_id.split(\":\")[1]) in EXCLUDED_CTS:\n",
    "            # don't over coarsen\n",
    "            new_id = cl_id\n",
    "            new_name = obo_tbl.loc[cl_id][\"name\"]\n",
    "    else:\n",
    "        new_id = cl_id\n",
    "        new_name = obo_tbl.loc[cl_id][\"name\"]\n",
    "    return new_name, new_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "5158860d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def coarsen_labels(labs, max_level=2):\n",
    "    coarsened_ids_dict = {}\n",
    "    coarsened_names_dict = {}\n",
    "    for cl in labs.unique():\n",
    "        try:\n",
    "            new_name, new_id = coarsen(cl)\n",
    "            i = 0\n",
    "            cl_og = cl\n",
    "            while new_id != cl:\n",
    "                cl_store = new_id\n",
    "                new_name, new_id = coarsen(new_id) \n",
    "                cl = cl_store\n",
    "                i += 1\n",
    "                if i > max_level:\n",
    "                    break\n",
    "            coarsened_ids_dict[cl_og] = new_id\n",
    "            coarsened_names_dict[cl_og] = new_name\n",
    "        except:\n",
    "            next\n",
    "    return coarsened_ids_dict, coarsened_names_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89940456",
   "metadata": {},
   "source": [
    "# Sapiens Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "b45650cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "human = sc.read(\"./data/sapiens.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "8d7c414c",
   "metadata": {},
   "outputs": [],
   "source": [
    "human.X = human.layers[\"decontXcounts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "f0cc9ba1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/anndata/_core/anndata.py:895: UserWarning: \n",
      "AnnData expects .var.index to contain strings, but got values like:\n",
      "    ['DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2']\n",
      "\n",
      "    Inferred to be: categorical\n",
      "\n",
      "  names = self._prep_dim_index(names, \"var\")\n"
     ]
    }
   ],
   "source": [
    "human.var_names = human.var[\"feature_name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "id": "85a586c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Level 20: added {'phagocyte', 'embryonic cell', 'barrier cell', 'hematopoietic cell', 'pigment cell', 'ciliated cell', 'contractile cell', 'connective tissue cell', 'electrically active cell', 'epithelial cell', 'adventitial cell', 'fat cell', 'salivary gland cell', 'cell of skeletal muscle', 'stem cell', 'leukocyte', 'secretory cell', 'neural cell'}\n",
      "Level 19: added set()\n",
      "Level 18: added set()\n",
      "Level 17: added set()\n",
      "Level 16: added set()\n",
      "Level 15: added set()\n",
      "Level 14: added set()\n",
      "Level 13: added set()\n",
      "Level 12: added set()\n",
      "Level 11: added set()\n",
      "Level 10: added set()\n",
      "Level 9: added set()\n",
      "Level 8: added set()\n",
      "Level 7: added set()\n",
      "Level 6: added set()\n",
      "Level 5: added {'nongranular leukocyte'}\n",
      "Level 4: added {'lymphocyte', 'electrically responsive cell'}\n",
      "Level 3: added {'endo-epithelial cell', 'T cell', 'neuron', 'neuron associated cell'}\n",
      "Level 2: added {'muscle cell', 'glial cell', 'alpha-beta T cell', 'electrically responsive cell', 'respiratory epithelial cell', 'hematopoietic precursor cell', 'afferent neuron'}\n",
      "Level 1: added {'lining cell', 'lymphocyte of B lineage', 'sensory neuron', 'neuron', 'epithelial cell of lung', 'immature T cell', 'ciliated epithelial cell', 'squamous epithelial cell', 'neuron associated cell', 'glial cell (sensu Vertebrata)', 'non-striated muscle cell', 'columnar/cuboidal epithelial cell', 'somatic stem cell', 'epithelial cell of lower respiratory tract', 'innate lymphoid cell', 'mature alpha-beta T cell', 'hematopoietic lineage restricted progenitor cell'}\n"
     ]
    }
   ],
   "source": [
    "old_set = {}\n",
    "for level in np.arange(20, 0, -1):\n",
    "    coarsened_ids_dict, coarsened_names_dict = coarsen_labels(human.obs[\"cell_type_ontology_term_id\"], max_level=level)\n",
    "    new_set = set(coarsened_names_dict.values())\n",
    "    diff = new_set.difference(old_set)\n",
    "    print(f\"Level {level}: added {diff}\")\n",
    "    old_set = new_set\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "f928a115",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_ids_dict, coarsened_names_dict = coarsen_labels(human.obs[\"cell_type_ontology_term_id\"], max_level=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "40816d97",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'fibroblast of breast',\n",
       " 'intestinal crypt stem cell of large intestine',\n",
       " 'intestinal crypt stem cell of small intestine',\n",
       " 'intestinal tuft cell',\n",
       " 'paneth cell of colon',\n",
       " 'pulmonary ionocyte',\n",
       " 'transit amplifying cell of colon',\n",
       " 'transit amplifying cell of small intestine'}"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "not_mapped = {}\n",
    "for ctid, ctname in zip(human.obs[\"cell_type_ontology_term_id\"], human.obs[\"cell_type\"]):\n",
    "    if ctid not in coarsened_names_dict.keys():\n",
    "        not_mapped[ctname] = ctid\n",
    "set(not_mapped.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "9c2ffa6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_names_dict = {**coarsened_names_dict, **{not_mapped['fibroblast of breast']:\"connective tissue cell\",\n",
    " not_mapped['intestinal crypt stem cell of large intestine']:\"stem cell\",\n",
    " not_mapped['intestinal crypt stem cell of small intestine']:\"stem cell\",\n",
    " not_mapped['intestinal tuft cell']:\"epithelial cell\",\n",
    " not_mapped['paneth cell of colon']:\"epithelial cell\",\n",
    " not_mapped['pulmonary ionocyte']:\"epithelial cell\",\n",
    " not_mapped['transit amplifying cell of colon']:\"stem cell\",\n",
    " not_mapped['transit amplifying cell of small intestine']:\"stem cell\"}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "5d34b6d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in human.obs[\"cell_type_ontology_term_id\"]]\n",
    "coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(human.obs[\"cell_type_ontology_term_id\"], human.obs[\"cell_type\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "3539fac9",
   "metadata": {},
   "outputs": [],
   "source": [
    "human.obs[\"coarse_cell_id\"] = coarsened_ids\n",
    "human.obs[\"coarse_cell_type\"] = coarsened_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "324833f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.\n",
      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
     ]
    }
   ],
   "source": [
    "sc.pp.filter_genes(human, min_counts=500)\n",
    "sc.pp.filter_cells(human, min_counts=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "cc34fde7",
   "metadata": {},
   "outputs": [],
   "source": [
    "if ten_x_subset:\n",
    "    human = human[human.obs[\"assay\"] == \"10x 3' v3\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "3bcf80ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/user/21290/ipykernel_571139/3916522068.py:1: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  human.obs[\"tissue_type\"] = [human_tissue_map[t] for t in human.obs[\"tissue_in_publication\"]]\n"
     ]
    }
   ],
   "source": [
    "human.obs[\"tissue_type\"] = [human_tissue_map[t] for t in human.obs[\"tissue_in_publication\"]]\n",
    "if tissue_subset:\n",
    "    human = human[human.obs[\"tissue_type\"].isin(keep_tissues)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "39d1fe37",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "62\n"
     ]
    }
   ],
   "source": [
    "# filter to large cell types\n",
    "if cell_type_number_filter > 0:\n",
    "    human_keep_cell_types = human.obs[\"cell_ontology_class\"].value_counts()[np.where(human.obs[\"cell_ontology_class\"].value_counts() > cell_type_number_filter)[0]].index\n",
    "    human = human[human.obs[\"cell_ontology_class\"].isin(human_keep_cell_types)]\n",
    "\n",
    "    print(len(human_keep_cell_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "fee0961c",
   "metadata": {},
   "outputs": [],
   "source": [
    "human_subset = human\n",
    "human_subset.obs = human.obs[[\"coarse_cell_type\", \"cell_ontology_class\", \"tissue_type\"]]\n",
    "human_subset.obs[\"cell_type\"] = human.obs[\"cell_ontology_class\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "940e30f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "del human_subset.raw # annoying subset rule for anndatas, to work with SAMap Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "f00b32db",
   "metadata": {},
   "outputs": [],
   "source": [
    "human_subset.write(f\"./data/human_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "b8af010a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/dfs/project/cross-species/yanay/data/tabula/finished/human_ct350_tissueTrue_10xTrue.h5ad'"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f\"/dfs/project/cross-species/yanay/data/tabula/finished/human_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "a5bdd95b",
   "metadata": {},
   "outputs": [],
   "source": [
    "human = human_subset = None "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6057d27d",
   "metadata": {},
   "source": [
    "# Tabula Microcebus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "3ca68766",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 244081 × 31509\n",
       "    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps', 'nMimu_239ps', 'nMimu_249', 'nMimu_DMA', 'nMimu_DMB', 'nMimu_DPA', 'nMimu_DPB', 'nMimu_DQA', 'nMimu_DQB', 'nMimu_DRA', 'nMimu_DRB', 'nMHC_C_I', 'nMHC_NC_I', 'nMHC_all_II'\n",
       "    var: 'name', 'highly_variable'\n",
       "    uns: 'compartment_update_colors'\n",
       "    obsm: 'X_pca', 'X_umap'\n",
       "    layers: 'raw_counts'"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lemur = sc.read(\"./data/mouse_lemur.h5ad\")\n",
    "lemur"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "d3fda337",
   "metadata": {},
   "outputs": [],
   "source": [
    "lemur.X = lemur.layers[\"raw_counts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "ef0cbcf1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "145"
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lemur_ontology_names = lemur.obs[\"cell_ontology_class_v1\"].unique()\n",
    "len(lemur_ontology_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "id": "80187cf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "lemur_ontology_names_to_id = {}\n",
    "for lon in lemur_ontology_names:\n",
    "    r = obo_tbl[obo_tbl[\"name\"] == lon]\n",
    "    if r.shape[0] != 0:\n",
    "        lemur_ontology_names_to_id[lon] = obo_tbl[obo_tbl[\"name\"] == lon].index[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "711f8569",
   "metadata": {},
   "outputs": [],
   "source": [
    "lemur_ct_ids = [lemur_ontology_names_to_id.get(ctname, \"na\") for ctname in lemur.obs[\"cell_ontology_class_v1\"]]\n",
    "lemur.obs[\"cell_ontology_id\"] = lemur_ct_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "id": "ebca2673",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Level 20: added {'phagocyte', 'hair follicle cell', 'barrier cell', 'hematopoietic cell', 'pigment cell', 'contractile cell', 'connective tissue cell', 'electrically active cell', 'epithelial cell', 'osteoblast', 'germ line cell', 'fat cell', 'cell of skeletal muscle', 'preosteoblast', 'stem cell', 'leukocyte', 'M cell of gut', 'neural cell', 'secretory cell', 'ciliated cell', 'kidney cell'}\n",
      "Level 19: added set()\n",
      "Level 18: added set()\n",
      "Level 17: added set()\n",
      "Level 16: added set()\n",
      "Level 15: added set()\n",
      "Level 14: added set()\n",
      "Level 13: added set()\n",
      "Level 12: added set()\n",
      "Level 11: added set()\n",
      "Level 10: added set()\n",
      "Level 9: added set()\n",
      "Level 8: added set()\n",
      "Level 7: added set()\n",
      "Level 6: added {'electrically responsive cell'}\n",
      "Level 5: added {'neuron'}\n",
      "Level 4: added {'kidney epithelial cell', 'afferent neuron'}\n",
      "Level 3: added {'sensory neuron', 'nongranular leukocyte', 'endo-epithelial cell', 'electrically responsive cell', 'epithelial cell of nephron'}\n",
      "Level 2: added {'neuron', 'muscle cell', 'squamous epithelial cell', 'neuron associated cell', 'neuronal receptor cell', 'kidney epithelial cell', 'respiratory epithelial cell', 'kidney tubule cell', 'lymphocyte'}\n",
      "Level 1: added {'lining cell', 'interneuron', 'epithelial cell of lung', 'glandular epithelial cell', 'glial cell', 'ciliated epithelial cell', 'T cell', 'epithelial cell of nephron', 'endo-epithelial cell', 'photoreceptor cell', 'stratified squamous epithelial cell', 'ecto-epithelial cell', 'epithelial cell of lower respiratory tract', 'blood vessel endothelial cell', 'striated muscle cell', 'nephron tubule epithelial cell'}\n"
     ]
    }
   ],
   "source": [
    "old_set = {}\n",
    "for level in np.arange(20, 0, -1):\n",
    "    coarsened_ids_dict, coarsened_names_dict = coarsen_labels(lemur.obs[\"cell_ontology_id\"], max_level=level)\n",
    "    new_set = set(coarsened_names_dict.values())\n",
    "    diff = new_set.difference(old_set)\n",
    "    print(f\"Level {level}: added {diff}\")\n",
    "    old_set = new_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "id": "c7e22723",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_ids_dict, coarsened_names_dict = coarsen_labels(lemur.obs[\"cell_ontology_id\"], max_level=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "id": "c955fd22",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'cardiomyocyte',\n",
       " 'cell',\n",
       " 'gonadotroph',\n",
       " 'intestinal tuft cell',\n",
       " 'lactotroph',\n",
       " 'mesothelial cell of epicardium',\n",
       " 'pancreatic B cell',\n",
       " 'podocyte',\n",
       " 'unassigned'}"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "not_mapped = {}\n",
    "for ctid, ctname in zip(lemur.obs[\"cell_ontology_id\"], lemur.obs[\"cell_ontology_class_v1\"]):\n",
    "    if ctid not in coarsened_names_dict.keys():\n",
    "        not_mapped[ctname] = ctid\n",
    "set(not_mapped.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "5c9040f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_names_dict = {**coarsened_names_dict, \n",
    "                        **{  not_mapped['cardiomyocyte']: 'contractile cell',\n",
    "                             not_mapped['gonadotroph']: 'secretory cell',\n",
    "                             not_mapped['intestinal tuft cell']: 'epithelial cell',\n",
    "                             not_mapped['lactotroph']: 'secretory cell',\n",
    "                             not_mapped['mesothelial cell of epicardium']: '',\n",
    "                             not_mapped['pancreatic B cell']: 'secretory cell',\n",
    "                             not_mapped['podocyte']: 'secretory cell'}}\n",
    "                           \n",
    "to_remove = [\"cell\", \"unassigned\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "9e9997b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in lemur.obs[\"cell_ontology_id\"]]\n",
    "coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(lemur.obs[\"cell_ontology_id\"], lemur.obs[\"cell_ontology_class_v1\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "eda28d8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "lemur.obs[\"coarse_cell_id\"] = coarsened_ids\n",
    "lemur.obs[\"coarse_cell_type\"] = coarsened_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "86144711",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "View of AnnData object with n_obs × n_vars = 231752 × 31509\n",
       "    obs: 'nCount_RNA', 'nFeature_RNA', 'cell_name', 'cell_barcode_10x', 'sequencing_run_10x', 'channel_10x', 'possibly_contaminated_barcode_10x', 'method', 'individual', 'age', 'sex', 'tissue', 'tissue_system', 'tissue_order', 'subtissue', 'compartment_v1', 'cell_ontology_class_v1', 'free_annotation_v1', 'tissue__cell_ontology_class_v1', 'tissue__free_annotation_v1', 'mix_hybrid', 'low_quality', 'dendrogram_annotation_number', 'dendrogram_annotation_order', 'order__compartment_freeannotation_tissue', 'order__tissue_compartment_freeannotation', 'Mimu_168', 'Mimu_W03', 'Mimu_W04', 'Mimu_180ps', 'Mimu_191', 'Mimu_202', 'Mimu_208', 'Mimu_218', 'Mimu_229ps', 'Mimu_239ps', 'Mimu_249', 'Mimu_DMA', 'Mimu_DMB', 'Mimu_DPA', 'Mimu_DPB', 'Mimu_DQA', 'Mimu_DQB', 'Mimu_DRA', 'Mimu_DRB', 'MHC_C_I', 'MHC_NC_I', 'MHC_all_II', 'nMimu_168', 'nMimu_W03', 'nMimu_W04', 'nMimu_180ps', 'nMimu_191', 'nMimu_202', 'nMimu_208', 'nMimu_218', 'nMimu_229ps', 'nMimu_239ps', 'nMimu_249', 'nMimu_DMA', 'nMimu_DMB', 'nMimu_DPA', 'nMimu_DPB', 'nMimu_DQA', 'nMimu_DQB', 'nMimu_DRA', 'nMimu_DRB', 'nMHC_C_I', 'nMHC_NC_I', 'nMHC_all_II', 'cell_ontology_id', 'coarse_cell_id', 'coarse_cell_type'\n",
       "    var: 'name', 'highly_variable'\n",
       "    uns: 'compartment_update_colors'\n",
       "    obsm: 'X_pca', 'X_umap'\n",
       "    layers: 'raw_counts'"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "if ten_x_subset:\n",
    "    lemur = lemur[lemur.obs[\"method\"] == \"10x\"]\n",
    "lemur"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "id": "0df9864b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/scanpy/preprocessing/_simple.py:249: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.\n",
      "  adata.var['n_counts'] = number\n"
     ]
    }
   ],
   "source": [
    "sc.pp.filter_genes(lemur, min_counts=500)\n",
    "sc.pp.filter_cells(lemur, min_counts=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "id": "b10f3cc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "lemur.obs[\"tissue_type\"] = [lemur_tissue_map[t] for t in lemur.obs[\"tissue\"]]\n",
    "\n",
    "if tissue_subset:\n",
    "    lemur = lemur[lemur.obs[\"tissue_type\"].isin(keep_tissues)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "id": "1234501a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "<>:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "/tmp/user/21290/ipykernel_571139/3972575227.py:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "  if cell_type_number_filter is not 0:\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36\n"
     ]
    }
   ],
   "source": [
    "# Keep only cell types with more than `cell_type_number_filter` cells (0 disables the filter).\n",
    "# `!= 0` compares by value; `is not 0` compared object identity with a literal.\n",
    "if cell_type_number_filter != 0:\n",
    "    lemur_cell_type_counts = lemur.obs[\"cell_ontology_class_v1\"].value_counts()\n",
    "    # Boolean mask rather than integer positions fed to `[]` on a label-indexed Series.\n",
    "    lemur_keep_cell_types = lemur_cell_type_counts[lemur_cell_type_counts > cell_type_number_filter].index\n",
    "    # `.copy()` so the result is a real AnnData, not a view.\n",
    "    lemur = lemur[lemur.obs[\"cell_ontology_class_v1\"].isin(lemur_keep_cell_types)].copy()\n",
    "\n",
    "    print(len(lemur_keep_cell_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "dd50e3d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_ontology_class_v1</th>\n",
       "      <th>coarse_cell_type</th>\n",
       "      <th>tissue_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>L2_Pancreas_10X_AAACCTGAGAGACGAA</th>\n",
       "      <td>pancreatic acinar cell</td>\n",
       "      <td>epithelial cell</td>\n",
       "      <td>pancreas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L2_Pancreas_10X_AAACCTGAGGATATAC</th>\n",
       "      <td>macrophage</td>\n",
       "      <td>phagocyte</td>\n",
       "      <td>pancreas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L2_Pancreas_10X_AAACCTGCAGTATGCT</th>\n",
       "      <td>neutrophil</td>\n",
       "      <td>hematopoietic cell</td>\n",
       "      <td>pancreas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L2_Pancreas_10X_AAACCTGCATGGTCAT</th>\n",
       "      <td>neutrophil</td>\n",
       "      <td>hematopoietic cell</td>\n",
       "      <td>pancreas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L2_Pancreas_10X_AAACCTGCATTGGCGC</th>\n",
       "      <td>capillary endothelial cell</td>\n",
       "      <td>epithelial cell</td>\n",
       "      <td>pancreas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L4_Spleen_10X_TTTGTCAGTACCGGCT</th>\n",
       "      <td>B cell</td>\n",
       "      <td>leukocyte</td>\n",
       "      <td>spleen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L4_Spleen_10X_TTTGTCAGTCAGAAGC</th>\n",
       "      <td>neutrophil</td>\n",
       "      <td>hematopoietic cell</td>\n",
       "      <td>spleen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L4_Spleen_10X_TTTGTCAGTTGAGGTG</th>\n",
       "      <td>unassigned</td>\n",
       "      <td>secretory cell</td>\n",
       "      <td>spleen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L4_Spleen_10X_TTTGTCATCGAATGCT</th>\n",
       "      <td>B cell</td>\n",
       "      <td>leukocyte</td>\n",
       "      <td>spleen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>L4_Spleen_10X_TTTGTCATCTACTCAT</th>\n",
       "      <td>B cell</td>\n",
       "      <td>leukocyte</td>\n",
       "      <td>spleen</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>104705 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      cell_ontology_class_v1  \\\n",
       "L2_Pancreas_10X_AAACCTGAGAGACGAA      pancreatic acinar cell   \n",
       "L2_Pancreas_10X_AAACCTGAGGATATAC                  macrophage   \n",
       "L2_Pancreas_10X_AAACCTGCAGTATGCT                  neutrophil   \n",
       "L2_Pancreas_10X_AAACCTGCATGGTCAT                  neutrophil   \n",
       "L2_Pancreas_10X_AAACCTGCATTGGCGC  capillary endothelial cell   \n",
       "...                                                      ...   \n",
       "L4_Spleen_10X_TTTGTCAGTACCGGCT                        B cell   \n",
       "L4_Spleen_10X_TTTGTCAGTCAGAAGC                    neutrophil   \n",
       "L4_Spleen_10X_TTTGTCAGTTGAGGTG                    unassigned   \n",
       "L4_Spleen_10X_TTTGTCATCGAATGCT                        B cell   \n",
       "L4_Spleen_10X_TTTGTCATCTACTCAT                        B cell   \n",
       "\n",
       "                                    coarse_cell_type tissue_type  \n",
       "L2_Pancreas_10X_AAACCTGAGAGACGAA     epithelial cell    pancreas  \n",
       "L2_Pancreas_10X_AAACCTGAGGATATAC           phagocyte    pancreas  \n",
       "L2_Pancreas_10X_AAACCTGCAGTATGCT  hematopoietic cell    pancreas  \n",
       "L2_Pancreas_10X_AAACCTGCATGGTCAT  hematopoietic cell    pancreas  \n",
       "L2_Pancreas_10X_AAACCTGCATTGGCGC     epithelial cell    pancreas  \n",
       "...                                              ...         ...  \n",
       "L4_Spleen_10X_TTTGTCAGTACCGGCT             leukocyte      spleen  \n",
       "L4_Spleen_10X_TTTGTCAGTCAGAAGC    hematopoietic cell      spleen  \n",
       "L4_Spleen_10X_TTTGTCAGTTGAGGTG        secretory cell      spleen  \n",
       "L4_Spleen_10X_TTTGTCATCGAATGCT             leukocyte      spleen  \n",
       "L4_Spleen_10X_TTTGTCATCTACTCAT             leukocyte      spleen  \n",
       "\n",
       "[104705 rows x 3 columns]"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the three per-cell annotation columns used for the exported object.\n",
    "lemur.obs[[\"cell_ontology_class_v1\", \"coarse_cell_type\", \"tissue_type\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "id": "70eef36b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from copy import deepcopy\n",
    "\n",
    "# Build the export object from a copy of the kept cells. Aliasing `lemur` here would also\n",
    "# overwrite `lemur.obs`, which makes the cell non-idempotent on re-run.\n",
    "to_remove = [\"cell\", \"unassigned\"]  # cell type labels excluded from the export\n",
    "lemur_subset = lemur[~lemur.obs[\"cell_ontology_class_v1\"].isin(to_remove)].copy()\n",
    "# Keep only the annotation columns needed downstream, then expose the fine label as `cell_type`.\n",
    "lemur_subset.obs = lemur_subset.obs[[\"cell_ontology_class_v1\", \"coarse_cell_type\", \"tissue_type\"]].copy()\n",
    "lemur_subset.obs[\"cell_type\"] = lemur_subset.obs[\"cell_ontology_class_v1\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "id": "972070ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap a dense copy of the `raw_counts` layer in a new AnnData (no obs/var metadata yet).\n",
    "# NOTE(review): `.toarray()` densifies the whole matrix, which is memory heavy. `susbet` is a\n",
    "# misspelling of `subset`; the name is left as is because the following cells refer to it.\n",
    "lemur_susbet_raw = sc.AnnData(lemur_subset.layers[\"raw_counts\"].toarray())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "id": "2ef71078",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the per-cell annotations of `lemur_subset` to the raw-count object.\n",
    "lemur_susbet_raw.obs = lemur_subset.obs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "id": "f50d1256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this binds the misspelled name `lemur_susbet`; `lemur_subset` is unchanged,\n",
    "# so the raw-count AnnData built above is not the object written by `lemur_subset.write(...)`.\n",
    "# Confirm whether `lemur_subset = lemur_susbet_raw` was intended (that object has no `.var` set).\n",
    "lemur_susbet = lemur_susbet_raw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "id": "1ee3e7f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19691"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of genes in `lemur_subset`.\n",
    "len(lemur_subset.var_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "246e285d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the `.raw` attribute from the object before it is written out.\n",
    "del lemur_subset.raw # annoying subset rule for anndatas, to work with SAMap Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "id": "db2ff26d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the processed lemur object; the file name records the filter settings used.\n",
    "lemur_subset.write(f\"./data/mouse_lemur_ct{cell_type_number_filter}_tissue{tissue_subset}_10x{ten_x_subset}.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "id": "3dbc9393",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop every reference to the lemur objects, including the dense raw-count AnnData\n",
    "# (`lemur_susbet_raw` / `lemur_susbet`), so their memory can be reclaimed before the next dataset.\n",
    "lemur = lemur_subset = lemur_susbet_raw = lemur_susbet = None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2dc6422",
   "metadata": {},
   "source": [
    "# Tabula Muris"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "id": "ed4e5590",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./data/droplet/Marrow-10X_P7_3',\n",
       " './data/droplet/Kidney-10X_P7_5',\n",
       " './data/droplet/Kidney-10X_P4_6',\n",
       " './data/droplet/Lung-10X_P7_8',\n",
       " './data/droplet/Spleen-10X_P4_7',\n",
       " './data/droplet/Kidney-10X_P4_5',\n",
       " './data/droplet/Lung-10X_P8_12',\n",
       " './data/droplet/Trachea-10X_P8_15',\n",
       " './data/droplet/Limb_Muscle-10X_P7_14',\n",
       " './data/droplet/Tongue-10X_P4_1',\n",
       " './data/droplet/Mammary_Gland-10X_P7_13',\n",
       " './data/droplet/Liver-10X_P7_1',\n",
       " './data/droplet/Liver-10X_P4_2',\n",
       " './data/droplet/Bladder-10X_P4_4',\n",
       " './data/droplet/Bladder-10X_P7_7',\n",
       " './data/droplet/Lung-10X_P7_9',\n",
       " './data/droplet/Heart_and_Aorta-10X_P7_4',\n",
       " './data/droplet/Marrow-10X_P7_2',\n",
       " './data/droplet/Bladder-10X_P4_3',\n",
       " './data/droplet/Liver-10X_P7_0',\n",
       " './data/droplet/Spleen-10X_P7_6',\n",
       " './data/droplet/Trachea-10X_P8_14',\n",
       " './data/droplet/Limb_Muscle-10X_P7_15',\n",
       " './data/droplet/Tongue-10X_P4_0',\n",
       " './data/droplet/Mammary_Gland-10X_P7_12',\n",
       " './data/droplet/Tongue-10X_P7_10',\n",
       " './data/droplet/Lung-10X_P8_13',\n",
       " './data/droplet/Thymus-10X_P7_11']"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One directory per 10x sample. `glob` returns paths in arbitrary filesystem order, so sort\n",
    "# them to make the sample (and therefore concatenation) order identical on every run.\n",
    "mouse_tissue_files = sorted(glob(\"./data/droplet/*\"))\n",
    "mouse_tissue_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "aaf5cad2",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Read every 10x matrix directory, and use each directory's base name as its sample label.\n",
    "tissue_names = [tissue_file.split(\"/\")[-1] for tissue_file in mouse_tissue_files]\n",
    "mouse_all_ads = [sc.read_10x_mtx(tissue_file) for tissue_file in mouse_tissue_files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "fc0b0c2a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n"
     ]
    }
   ],
   "source": [
    "# Stack all samples into one AnnData; each cell's sample name is recorded in `obs[\"tissue\"]`.\n",
    "# Observation names are not unique after this step (see the warning); they are rebuilt below.\n",
    "mouse_all_tissues = sc.concat(mouse_all_ads, label=\"tissue\", keys=tissue_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "id": "33a0aa2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n",
      "/lfs/ampere2/0/yanay/lib/python3.8/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n"
     ]
    }
   ],
   "source": [
    "# Drop genes with fewer than 500 total counts, then cells with fewer than 1000 (both in place).\n",
    "sc.pp.filter_genes(mouse_all_tissues, min_counts=500)\n",
    "sc.pp.filter_cells(mouse_all_tissues, min_counts=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "id": "2fe7e7b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the part of each observation name before the first \"-\".\n",
    "# The result is a Series with a default 0..n-1 index, in the same order as the cells.\n",
    "barcodes = pd.Series(mouse_all_tissues.obs_names).str.split(\"-\", expand=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "id": "55b60442",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample labels have the form \"<Tissue>-<channel>\" (e.g. \"Marrow-10X_P7_3\"); take the channel part.\n",
    "tissue_ids = mouse_all_tissues.obs[\"tissue\"].str.split(\"-\", expand=True)[1]\n",
    "# reset_index() swaps the cell-name index for 0..n-1 so the values line up with `barcodes`;\n",
    "# the new names are \"<channel>_<barcode>\", the format used by the annotation table's index.\n",
    "new_obs_names = tissue_ids.reset_index()[1].str.cat(barcodes, sep=\"_\")\n",
    "mouse_all_tissues.obs_names = new_obs_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "id": "9634c719",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/user/21290/ipykernel_571139/2645208636.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  mouse_annot = pd.read_csv(\"./data/muris_annotations_droplet.csv\").set_index(\"cell\")\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_ontology_class</th>\n",
       "      <th>cell_ontology_id</th>\n",
       "      <th>channel</th>\n",
       "      <th>cluster.ids</th>\n",
       "      <th>free_annotation</th>\n",
       "      <th>mouse.id</th>\n",
       "      <th>mouse.sex</th>\n",
       "      <th>subsetA</th>\n",
       "      <th>subsetA_cluster.ids</th>\n",
       "      <th>subsetB</th>\n",
       "      <th>subsetB_cluster.ids</th>\n",
       "      <th>subtissue</th>\n",
       "      <th>tissue</th>\n",
       "      <th>tissue_tSNE_1</th>\n",
       "      <th>tissue_tSNE_2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cell</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10X_P7_8_AAACGGGAGGATATAC</th>\n",
       "      <td>myeloid cell</td>\n",
       "      <td>CL:0000763</td>\n",
       "      <td>10X_P7_8</td>\n",
       "      <td>20</td>\n",
       "      <td>dendritic cells and interstital macrophages</td>\n",
       "      <td>3-F-56</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lung</td>\n",
       "      <td>17.024721</td>\n",
       "      <td>-32.902836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_8_AAACGGGTCTCGTATT</th>\n",
       "      <td>alveolar macrophage</td>\n",
       "      <td>CL:0000583</td>\n",
       "      <td>10X_P7_8</td>\n",
       "      <td>5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3-F-56</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lung</td>\n",
       "      <td>25.160619</td>\n",
       "      <td>25.066566</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_8_AAAGATGCAGATCTGT</th>\n",
       "      <td>B cell</td>\n",
       "      <td>CL:0000236</td>\n",
       "      <td>10X_P7_8</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3-F-56</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lung</td>\n",
       "      <td>1.740567</td>\n",
       "      <td>46.488878</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_8_AAATGCCAGATAGTCA</th>\n",
       "      <td>natural killer cell</td>\n",
       "      <td>CL:0000623</td>\n",
       "      <td>10X_P7_8</td>\n",
       "      <td>7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3-F-56</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lung</td>\n",
       "      <td>-31.647934</td>\n",
       "      <td>-2.208061</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_8_AAATGCCCAAACTGCT</th>\n",
       "      <td>T cell</td>\n",
       "      <td>CL:0000084</td>\n",
       "      <td>10X_P7_8</td>\n",
       "      <td>21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3-F-56</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Lung</td>\n",
       "      <td>-37.281266</td>\n",
       "      <td>-5.619565</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_15_TTTGTCAAGCCAGAAC</th>\n",
       "      <td>endothelial cell</td>\n",
       "      <td>CL:0000115</td>\n",
       "      <td>10X_P7_15</td>\n",
       "      <td>4</td>\n",
       "      <td>endothelial cell</td>\n",
       "      <td>3-F-57</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Limb_Muscle</td>\n",
       "      <td>21.778547</td>\n",
       "      <td>-15.239181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_15_TTTGTCACAGCCTTGG</th>\n",
       "      <td>endothelial cell</td>\n",
       "      <td>CL:0000115</td>\n",
       "      <td>10X_P7_15</td>\n",
       "      <td>10</td>\n",
       "      <td>endothelial cell</td>\n",
       "      <td>3-F-57</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Limb_Muscle</td>\n",
       "      <td>37.977851</td>\n",
       "      <td>-10.079247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_15_TTTGTCAGTAAGGGCT</th>\n",
       "      <td>mesenchymal stem cell</td>\n",
       "      <td>CL:0000134</td>\n",
       "      <td>10X_P7_15</td>\n",
       "      <td>9</td>\n",
       "      <td>mesenchymal stem cell</td>\n",
       "      <td>3-F-57</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Limb_Muscle</td>\n",
       "      <td>-27.254255</td>\n",
       "      <td>-10.505882</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_15_TTTGTCAGTCTCCACT</th>\n",
       "      <td>mesenchymal stem cell</td>\n",
       "      <td>CL:0000134</td>\n",
       "      <td>10X_P7_15</td>\n",
       "      <td>8</td>\n",
       "      <td>mesenchymal stem cell</td>\n",
       "      <td>3-F-57</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Limb_Muscle</td>\n",
       "      <td>-15.538574</td>\n",
       "      <td>-4.647427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10X_P7_15_TTTGTCAGTTGTGGAG</th>\n",
       "      <td>skeletal muscle satellite cell</td>\n",
       "      <td>CL:0000594</td>\n",
       "      <td>10X_P7_15</td>\n",
       "      <td>5</td>\n",
       "      <td>skeletal muscle satellite cell</td>\n",
       "      <td>3-F-57</td>\n",
       "      <td>F</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Limb_Muscle</td>\n",
       "      <td>11.749277</td>\n",
       "      <td>11.218774</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>55656 rows × 15 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       cell_ontology_class cell_ontology_id  \\\n",
       "cell                                                                          \n",
       "10X_P7_8_AAACGGGAGGATATAC                     myeloid cell       CL:0000763   \n",
       "10X_P7_8_AAACGGGTCTCGTATT              alveolar macrophage       CL:0000583   \n",
       "10X_P7_8_AAAGATGCAGATCTGT                           B cell       CL:0000236   \n",
       "10X_P7_8_AAATGCCAGATAGTCA              natural killer cell       CL:0000623   \n",
       "10X_P7_8_AAATGCCCAAACTGCT                           T cell       CL:0000084   \n",
       "...                                                    ...              ...   \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC                endothelial cell       CL:0000115   \n",
       "10X_P7_15_TTTGTCACAGCCTTGG                endothelial cell       CL:0000115   \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT           mesenchymal stem cell       CL:0000134   \n",
       "10X_P7_15_TTTGTCAGTCTCCACT           mesenchymal stem cell       CL:0000134   \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG  skeletal muscle satellite cell       CL:0000594   \n",
       "\n",
       "                              channel  cluster.ids  \\\n",
       "cell                                                 \n",
       "10X_P7_8_AAACGGGAGGATATAC    10X_P7_8           20   \n",
       "10X_P7_8_AAACGGGTCTCGTATT    10X_P7_8            5   \n",
       "10X_P7_8_AAAGATGCAGATCTGT    10X_P7_8           12   \n",
       "10X_P7_8_AAATGCCAGATAGTCA    10X_P7_8            7   \n",
       "10X_P7_8_AAATGCCCAAACTGCT    10X_P7_8           21   \n",
       "...                               ...          ...   \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC  10X_P7_15            4   \n",
       "10X_P7_15_TTTGTCACAGCCTTGG  10X_P7_15           10   \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT  10X_P7_15            9   \n",
       "10X_P7_15_TTTGTCAGTCTCCACT  10X_P7_15            8   \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG  10X_P7_15            5   \n",
       "\n",
       "                                                        free_annotation  \\\n",
       "cell                                                                      \n",
       "10X_P7_8_AAACGGGAGGATATAC   dendritic cells and interstital macrophages   \n",
       "10X_P7_8_AAACGGGTCTCGTATT                                           NaN   \n",
       "10X_P7_8_AAAGATGCAGATCTGT                                           NaN   \n",
       "10X_P7_8_AAATGCCAGATAGTCA                                           NaN   \n",
       "10X_P7_8_AAATGCCCAAACTGCT                                           NaN   \n",
       "...                                                                 ...   \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC                             endothelial cell   \n",
       "10X_P7_15_TTTGTCACAGCCTTGG                             endothelial cell   \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT                        mesenchymal stem cell   \n",
       "10X_P7_15_TTTGTCAGTCTCCACT                        mesenchymal stem cell   \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG               skeletal muscle satellite cell   \n",
       "\n",
       "                           mouse.id mouse.sex subsetA  subsetA_cluster.ids  \\\n",
       "cell                                                                         \n",
       "10X_P7_8_AAACGGGAGGATATAC    3-F-56         F     NaN                  NaN   \n",
       "10X_P7_8_AAACGGGTCTCGTATT    3-F-56         F     NaN                  NaN   \n",
       "10X_P7_8_AAAGATGCAGATCTGT    3-F-56         F     NaN                  NaN   \n",
       "10X_P7_8_AAATGCCAGATAGTCA    3-F-56         F     NaN                  NaN   \n",
       "10X_P7_8_AAATGCCCAAACTGCT    3-F-56         F     NaN                  NaN   \n",
       "...                             ...       ...     ...                  ...   \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC   3-F-57         F     NaN                  NaN   \n",
       "10X_P7_15_TTTGTCACAGCCTTGG   3-F-57         F     NaN                  NaN   \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT   3-F-57         F     NaN                  NaN   \n",
       "10X_P7_15_TTTGTCAGTCTCCACT   3-F-57         F     NaN                  NaN   \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG   3-F-57         F     NaN                  NaN   \n",
       "\n",
       "                           subsetB  subsetB_cluster.ids subtissue  \\\n",
       "cell                                                                \n",
       "10X_P7_8_AAACGGGAGGATATAC      NaN                  NaN       NaN   \n",
       "10X_P7_8_AAACGGGTCTCGTATT      NaN                  NaN       NaN   \n",
       "10X_P7_8_AAAGATGCAGATCTGT      NaN                  NaN       NaN   \n",
       "10X_P7_8_AAATGCCAGATAGTCA      NaN                  NaN       NaN   \n",
       "10X_P7_8_AAATGCCCAAACTGCT      NaN                  NaN       NaN   \n",
       "...                            ...                  ...       ...   \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC     NaN                  NaN       NaN   \n",
       "10X_P7_15_TTTGTCACAGCCTTGG     NaN                  NaN       NaN   \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT     NaN                  NaN       NaN   \n",
       "10X_P7_15_TTTGTCAGTCTCCACT     NaN                  NaN       NaN   \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG     NaN                  NaN       NaN   \n",
       "\n",
       "                                 tissue  tissue_tSNE_1  tissue_tSNE_2  \n",
       "cell                                                                   \n",
       "10X_P7_8_AAACGGGAGGATATAC          Lung      17.024721     -32.902836  \n",
       "10X_P7_8_AAACGGGTCTCGTATT          Lung      25.160619      25.066566  \n",
       "10X_P7_8_AAAGATGCAGATCTGT          Lung       1.740567      46.488878  \n",
       "10X_P7_8_AAATGCCAGATAGTCA          Lung     -31.647934      -2.208061  \n",
       "10X_P7_8_AAATGCCCAAACTGCT          Lung     -37.281266      -5.619565  \n",
       "...                                 ...            ...            ...  \n",
       "10X_P7_15_TTTGTCAAGCCAGAAC  Limb_Muscle      21.778547     -15.239181  \n",
       "10X_P7_15_TTTGTCACAGCCTTGG  Limb_Muscle      37.977851     -10.079247  \n",
       "10X_P7_15_TTTGTCAGTAAGGGCT  Limb_Muscle     -27.254255     -10.505882  \n",
       "10X_P7_15_TTTGTCAGTCTCCACT  Limb_Muscle     -15.538574      -4.647427  \n",
       "10X_P7_15_TTTGTCAGTTGTGGAG  Limb_Muscle      11.749277      11.218774  \n",
       "\n",
       "[55656 rows x 15 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Per-cell annotations for the droplet data, indexed by cell id.\n",
    "# low_memory=False parses each column in one pass, so a column with mixed value types gets a\n",
    "# single consistent dtype instead of raising a DtypeWarning.\n",
    "mouse_annot = pd.read_csv(\"./data/muris_annotations_droplet.csv\", low_memory=False).set_index(\"cell\")\n",
    "display(mouse_annot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "id": "ded11a36",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "55652"
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cell ids present in both the annotation table and the expression data.\n",
    "# Stored as a sorted list so that the row order of the subset built from it is deterministic\n",
    "# (the iteration order of a set of strings can differ between Python sessions).\n",
    "keep_barcodes = sorted(set(mouse_annot.index).intersection(mouse_all_tissues.obs_names))\n",
    "len(keep_barcodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "id": "2a63a119",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/user/21290/ipykernel_571139/436733469.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  mouse.obs[\"cell_type\"] = mouse_annot[\"cell_ontology_class\"]\n"
     ]
    }
   ],
   "source": [
    "# Keep only the annotated cells. `.copy()` yields a real AnnData, so adding `.obs` columns\n",
    "# below does not trigger an implicit view-to-copy conversion.\n",
    "mouse = mouse_all_tissues[list(keep_barcodes), :].copy()\n",
    "# The assignments align the annotation columns to `mouse.obs` by cell id (the index).\n",
    "mouse.obs[\"cell_type\"] = mouse_annot[\"cell_ontology_class\"]\n",
    "mouse.obs[\"cell_ontology_id\"] = mouse_annot[\"cell_ontology_id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "id": "174b99a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-apply the same count thresholds now that only annotated cells remain (both in place).\n",
    "sc.pp.filter_genes(mouse, min_counts=500)\n",
    "sc.pp.filter_cells(mouse, min_counts=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "id": "a1681d4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample labels look like \"<Tissue>-<channel>\"; the part before \"-\" is the key into\n",
    "# `mouse_tissue_map` (KeyError on an unmapped tissue).\n",
    "mouse.obs[\"tissue_type\"] = [mouse_tissue_map[t.split(\"-\")[0]] for t in mouse.obs[\"tissue\"]]\n",
    "if tissue_subset:\n",
    "    # `.copy()` keeps `mouse` a real AnnData so later `.obs` assignments do not hit a view.\n",
    "    mouse = mouse[mouse.obs[\"tissue_type\"].isin(keep_tissues)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "id": "9cc25690",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "24\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<>:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "<>:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "/tmp/user/21290/ipykernel_571139/3674833211.py:2: SyntaxWarning: \"is not\" with a literal. Did you mean \"!=\"?\n",
      "  if cell_type_number_filter is not 0:\n"
     ]
    }
   ],
   "source": [
    "# Keep only cell types with more than `cell_type_number_filter` cells (0 disables the filter).\n",
    "# `!= 0` compares by value; `is not 0` compared object identity with a literal.\n",
    "if cell_type_number_filter != 0:\n",
    "    mouse_cell_type_counts = mouse.obs[\"cell_type\"].value_counts()\n",
    "    # Boolean mask rather than integer positions fed to `[]` on a label-indexed Series.\n",
    "    mouse_keep_cell_types = mouse_cell_type_counts[mouse_cell_type_counts > cell_type_number_filter].index\n",
    "    # `.copy()` so the `.obs` columns added later are written to a real AnnData, not a view.\n",
    "    mouse = mouse[mouse.obs[\"cell_type\"].isin(mouse_keep_cell_types)].copy()\n",
    "    print(len(mouse_keep_cell_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "id": "e3d2e67b",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# `coarsen_labels` is defined elsewhere in this notebook. Its two results are unpacked into\n",
    "# dicts that the next cell looks up by cell ontology id.\n",
    "# NOTE(review): the meaning of `max_level=1` is not visible here - see `coarsen_labels`.\n",
    "coarsened_ids_dict, coarsened_names_dict = coarsen_labels(mouse.obs[\"cell_ontology_id\"], max_level=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "id": "f918ccfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look up each cell's ontology id in the coarsening dicts. An id missing from a dict falls\n",
    "# back to the original id (first list) or the original cell type name (second list).\n",
    "coarsened_ids = [coarsened_ids_dict.get(ctid, ctid) for ctid in mouse.obs[\"cell_ontology_id\"]]\n",
    "coarsened_names = [coarsened_names_dict.get(ctid, ctname) for ctid, ctname in zip(mouse.obs[\"cell_ontology_id\"], mouse.obs[\"cell_type\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "id": "9bd4dbc3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/user/21290/ipykernel_571139/1827529850.py:1: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  mouse.obs[\"coarse_cell_id\"] = coarsened_ids\n"
     ]
    }
   ],
   "source": [
    "# Store the coarsened ids and names as per-cell annotation columns.\n",
    "mouse.obs[\"coarse_cell_id\"] = coarsened_ids\n",
    "mouse.obs[\"coarse_cell_type\"] = coarsened_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "id": "66e52c07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the processed mouse object; the file name records the filter settings used.\n",
    "mouse.write(f\"./data/muris_ct{cell_type_number_filter}_tissue{tissue_subset}.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "id": "c0906a0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop this name's reference to the mouse AnnData.\n",
    "# NOTE(review): `mouse_all_tissues` and `mouse_all_ads` are not cleared here and keep their data alive.\n",
    "mouse = None"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
